\subsection{Independence of events}
\begin{definition}
	Let \((\Omega, \mathcal F, \mathbb P)\) be a probability space.
	Let \(A, B \in \mathcal F\).
	\(A\) and \(B\) are called independent if
	\[
		\prob{A \cap B} = \prob{A} \cdot \prob{B}
	\]
	We write \(A \perp B\), or \(A \perp\!\!\!\perp B\).
	A countable collection of events \((A_n)\) is said to be independent if for every \(k\) and all distinct indices \(i_1, \dots, i_k\), we have
	\[
		\prob{A_{i_1} \cap \dots \cap A_{i_k}} = \prod_{j=1}^k \prob{A_{i_j}}
	\]
\end{definition}
\begin{remark}
	To show that a collection of events is independent, it is insufficient to show that events are pairwise independent.
	For example, consider tossing a fair coin twice, so \(\Omega = \{ (0, 0), (0, 1), (1, 0), (1, 1) \}\).
	\(\prob{\{\omega\}} = \frac{1}{4}\).
	Consider the events \(A, B, C\) where
	\[
		A = \{ (0, 0), (0, 1) \};\quad B = \{ (0, 0), (1, 0) \};\quad C = \{ (1, 0), (0, 1) \}
	\]
	\[
		\prob{A} = \prob{B} = \prob{C} = \frac{1}{2}
	\]
	\begin{align*}
		\prob{A \cap B} = \prob{\{ (0, 0) \}} & = \frac{1}{4} = \prob{A} \cdot \prob{B} \\
		\prob{A \cap C} = \prob{\{ (0, 1) \}} & = \frac{1}{4} = \prob{A} \cdot \prob{C} \\
		\prob{B \cap C} = \prob{\{ (1, 0) \}} & = \frac{1}{4} = \prob{B} \cdot \prob{C}
	\end{align*}
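	However,
	\[
		\prob{A \cap B \cap C} = \prob{\varnothing} = 0 \neq \frac{1}{8} = \prob{A} \cdot \prob{B} \cdot \prob{C}
	\]
	so \(A, B, C\) are pairwise independent but not independent.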
\end{remark}
\begin{claim}
	If \(A \perp B\), then \(A \perp \stcomp{B}\).
\end{claim}
\begin{proof}
	\begin{align*}
		\prob{A \cap \stcomp{B}} & = \prob{A} - \prob{A \cap B}           \\
		                         & = \prob{A} - \prob{A} \cdot \prob{B}   \\
		                         & = \prob{A} \left( 1 - \prob{B} \right) \\
		                         & = \prob{A} \prob{\stcomp{B}}
	\end{align*}
	as required.
\end{proof}

\subsection{Conditional probability}
\begin{definition}
	Let \((\Omega, \mathcal F, \mathbb P)\) be a probability space.
	Let \(B \in \mathcal F\) with \(\prob{B} > 0\).
	For \(A \in \mathcal F\), we define the conditional probability of \(A\) given \(B\), written \(\prob{A \mid B}\), as
	\[
		\prob{A \mid B} = \frac{\prob{A \cap B}}{\prob{B}}
	\]
\end{definition}
Note that if \(A\) and \(B\) are independent, then
\[
	\prob{A \mid B} = \frac{\prob{A \cap B}}{\prob{B}} = \frac{\prob{A} \cdot \prob{B}}{\prob{B}} = \prob{A}
\]
\begin{claim}
	Suppose that \((A_n)\) is a disjoint sequence in \(\mathcal F\).
	Then
	\[
		\prob{\bigcup A_n \mathrel{\Big|} B} = \sum_{n} \prob{A_n \mid B}
	\]
	This is the countable additivity property for conditional probability.
\end{claim}
\begin{proof}
	\begin{align*}
		\prob{\bigcup A_n \mathrel{\Big|} B} & = \frac{\prob{\qty(\bigcup A_n) \cap B}}{\prob{B}} \\
		                                     & = \frac{\prob{\bigcup (A_n \cap B)}}{\prob{B}} \\
		\intertext{By countable additivity, since the \((A_n \cap B)\) are disjoint,}
		                                     & = \sum_n \frac{\prob{A_n \cap B}}{\prob{B}}    \\
		                                     & = \sum_n \prob{A_n \mid B}
	\end{align*}
\end{proof}
We can think of \(\prob{\wildcard \mid B}\) as a new probability measure for the same \(\Omega\).

\subsection{Law of total probability}
\begin{claim}
	Suppose \((B_n)\) is a disjoint collection of events in \(\mathcal F\), such that \(\bigcup B_n = \Omega\), and for all \(n\), we have \(\prob{B_n} > 0\).
	If \(A \in \mathcal F\) then
	\[
		\prob{A} = \sum_n \prob{A \mid B_n} \cdot \prob{B_n}
	\]
\end{claim}
\begin{proof}
	\begin{align*}
		\prob{A} & = \prob{A \cap \Omega}                   \\
		         & = \prob{A \cap \left(\bigcup B_n\right)} \\
		         & = \prob{\bigcup(A \cap B_n)}             \\
		\intertext{By countable additivity,}
		         & = \sum_n \prob{A \cap B_n}               \\
		         & = \sum_n \prob{A \mid B_n} \prob{B_n}
	\end{align*}
\end{proof}

\subsection{Bayes' formula}
\begin{claim}
	Suppose \((B_n)\) is a disjoint sequence of events with \(\bigcup B_n = \Omega\) and \(\prob{B_n} > 0\) for all \(n\).
	Then for any event \(A\) with \(\prob{A} > 0\),
	\[
		\prob{B_n \mid A} = \frac{\prob{A \mid B_n} \prob{B_n}}{\sum_k \prob{A \mid B_k} \prob{B_k}}
	\]
\end{claim}
\begin{proof}
	\begin{align*}
		\prob{B_n \mid A} & = \frac{\prob{B_n \cap A}}{\prob{A}}                                       \\
		                  & = \frac{\prob{A \mid B_n} \prob{B_n}}{\prob{A}}                            \\
		\intertext{By the law of total probability,}
		                  & = \frac{\prob{A \mid B_n} \prob{B_n}}{\sum_k \prob{A \mid B_k} \prob{B_k}}
	\end{align*}
\end{proof}
Note that on the right hand side, the numerator is one of the terms of the sum in the denominator.
This formula is the basis of Bayesian statistics.
It allows us to reverse the direction of a conditional probability---knowing the probabilities of the events \((B_n)\), and given a model of \(\prob{A \mid B_n}\), we can calculate the posterior probabilities of \(B_n\) given that \(A\) occurs.

\subsection{Bayes' formula for medical tests}
Consider the probability of getting a false positive on a test for a rare condition.
Suppose 0.1\% of the population have condition \(A\), and we have a test which is positive for 98\% of the affected population, and 1\% of those unaffected by the disease.
Picking an individual at random, what is the probability that they suffer from \(A\), given that they have a positive test?

We define \(A\) to be the set of individuals suffering from the condition, and \(P\) to be the set of individuals testing positive.
Then by Bayes' formula,
\[
	\prob{A \mid P} = \frac{\prob{P \mid A}\prob{A}}{\prob{P \mid A}\prob{A} + \prob{P \mid \stcomp{A}}\prob{\stcomp{A}}} = \frac{0.98 \cdot 0.001}{0.98 \cdot 0.001 + 0.01 \cdot 0.999} \approx 0.09 = 9\%
\]
Why is this so low?
We can rewrite this instance of Bayes' formula as
\[
	\prob{A \mid P} = \frac{1}{1 + \frac{\prob{P \mid \stcomp{A}}\prob{\stcomp{A}}}{\prob{P \mid A}\prob{A}}}
\]
Here, \(\prob{\stcomp{A}} \approx 1, \prob{P \mid A} \approx 1\).
So
\[
	\prob{A \mid P} \approx \frac{1}{1 + \frac{\prob{P \mid \stcomp{A}}}{\prob{A}}}
\]
So this is low because \(\prob{P \mid \stcomp{A}} \gg \prob{A}\).
Suppose that there is a population of 1000 people and about 1 suffers from the disease.
Among the 999 not suffering from \(A\), about 10 will test positive.
So there will be about 11 people who test positive, and only 1 out of 11 (9\%) of those actually has the disease.

\subsection{Probability changes under extra knowledge}
Consider these three statements:
\begin{enumerate}[(a)]
	\item I have two children, (at least) one of whom is a boy.
	\item I have two children, and the elder one is a boy.
	\item I have two children, one of whom is a boy born on a Thursday.
\end{enumerate}
What is the probability that I have two boys, given (a), (b) or (c)?
Since no further information is given, we will assume that all outcomes are equally likely.
We define:
\begin{itemize}
	\item \(BG\) is the event that the elder sibling is a boy, and the younger is a girl;
	\item \(GB\) is the event that the elder sibling is a girl, and the younger is a boy;
	\item \(BB\) is the event that both children are boys; and
	\item \(GG\) is the event that both children are girls.
\end{itemize}
Now, we have
\begin{enumerate}[(a)]
	\item \(\prob{BB \mid BB \cup BG \cup GB} = \frac{1}{3}\)
	\item \(\prob{BB \mid BB \cup BG} = \frac{1}{2}\)
	\item Let us define \(GT\) to be the event that the elder sibling is a girl, and the younger is a boy born on a Thursday, and define \(TN\) to be the event that the elder sibling is a boy born on a Thursday and the younger is a boy not born on a Thursday, and other events are defined similarly.
	      So
	      \begin{align*}
		      \prob{TT \cup TN \cup NT \mid GT \cup TG \cup TT \cup TN \cup NT} & = \frac{\prob{TT \cup TN \cup NT}}{\prob{GT \cup TG \cup TT \cup TN \cup NT}}                                                                                                                                                                                \\
		                                                                        & = \frac{\frac{1}{2}\frac{1}{7}\frac{1}{2}\frac{1}{7} + 2 \cdot \frac{1}{2}\frac{1}{7}\frac{1}{2}\frac{6}{7}}{2\cdot \frac{1}{2}\frac{1}{2}\frac{1}{7} + \frac{1}{2}\frac{1}{7}\frac{1}{2}\frac{1}{7} + 2 \cdot \frac{1}{2}\frac{1}{7}\frac{1}{2}\frac{6}{7}} \\
		                                                                        & = \frac{13}{27} \approx 48\%
	      \end{align*}
\end{enumerate}

\subsection{Simpson's paradox}
Consider the admissions of men and women from state and independent schools to a university, given by the following tables.
\begin{center}
	\begin{tabular}{c c c c}
		All applicants & Admitted & Rejected & \% Admitted \\ \midrule
		State          & 25       & 25       & 50\%        \\
		Independent    & 28       & 22       & 56\%        \\
	\end{tabular}
\end{center}
\begin{center}
	\begin{tabular}{c c c c}
		Men only    & Admitted & Rejected & \% Admitted \\ \midrule
		State       & 15       & 22       & 41\%        \\
		Independent & 5        & 8        & 38\%        \\
	\end{tabular}
\end{center}
\begin{center}
	\begin{tabular}{c c c c}
		Women only  & Admitted & Rejected & \% Admitted \\ \midrule
		State       & 10       & 3        & 77\%        \\
		Independent & 23       & 14       & 62\%        \\
	\end{tabular}
\end{center}
This is seemingly a paradox; both women and men are more likely to be admitted if they come from a state school, but when looking at all applicants, they are more likely to be admitted if they come from an independent school.
This is called Simpson's paradox; it arises when we aggregate data from disparate populations.
Let \(A\) be the event that an individual is admitted, \(B\) be the event that an individual is a man, and \(C\) be the event that an individual comes from a state school.
We see that
\begin{align*}
	\prob{A \mid B \cap C}          & > \prob{A \mid B \cap \stcomp{C}}          \\
	\prob{A \mid \stcomp{B} \cap C} & > \prob{A \mid \stcomp{B} \cap \stcomp{C}} \\
	\prob{A \mid C}                 & < \prob{A \mid \stcomp{C}}
\end{align*}
First, note that
\begin{align*}
	\prob{A \mid C} & = \prob{A \cap B \mid C} + \prob{A \cap \stcomp{B} \mid C}                                                                           \\
	                & = \frac{\prob{A \cap B \cap C}}{\prob{C}} + \frac{\prob{A \cap \stcomp{B} \cap C}}{\prob{C}}                                         \\
	                & = \frac{\prob{A \mid B \cap C} \prob{B \cap C}}{\prob{C}} + \frac{\prob{A \mid \stcomp{B} \cap C}\prob{\stcomp{B} \cap C}}{\prob{C}} \\
	                & = \prob{A \mid B \cap C} \prob{B \mid C} + \prob{A \mid \stcomp{B} \cap C} \prob{\stcomp{B} \mid C}                                  \\
	                & > \prob{A \mid B \cap \stcomp{C}} \prob{B \mid C} + \prob{A \mid \stcomp{B} \cap \stcomp{C}} \prob{\stcomp{B} \mid C}
\end{align*}
Let us also assume that \(\prob{B \mid C} = \prob{B \mid \stcomp{C}}\).
Then
\begin{align*}
	\prob{A \mid C} & > \prob{A \mid B \cap \stcomp{C}} \prob{B \mid \stcomp{C}} + \prob{A \mid \stcomp{B} \cap \stcomp{C}} \prob{\stcomp{B} \mid \stcomp{C}} \\
	                & = \prob{A \mid \stcomp{C}}
\end{align*}
So we needed to further assume that \(\prob{B \mid C} = \prob{B \mid \stcomp{C}}\) in order for the `intuitive' result to hold.
The assumption was not valid in the example, where \(\prob{B \mid C} = \frac{37}{50}\) but \(\prob{B \mid \stcomp{C}} = \frac{13}{50}\), so the result did not hold.
